pacman::p_load(ggplot2, dplyr, RMySQL, lubridate, psych, tidyr, plotly)
## Create a database connection.
## Each setting can be overridden through an environment variable
## (DA_DB_USER, DA_DB_PASSWORD, DA_DB_NAME, DA_DB_HOST) so credentials do not
## have to be edited into the script. The fallbacks are the values this script
## has always used, so it still runs unchanged when nothing is set.
## NOTE(review): the fallback password is still stored in source; consider
## dropping it once the environment variables are configured.
con <- dbConnect(
  MySQL(),
  user = Sys.getenv("DA_DB_USER", unset = "deepAnalytics"),
  password = Sys.getenv("DA_DB_PASSWORD", unset = "Sqltask1234!"),
  dbname = Sys.getenv("DA_DB_NAME", unset = "dataanalytics2018"),
  host = Sys.getenv(
    "DA_DB_HOST",
    unset = "data-analytics-2018.cbrosir2cswx.us-east-1.rds.amazonaws.com"
  )
)
## List the tables contained in the database
dbListTables(con)
## [1] "iris" "yr_2006" "yr_2007" "yr_2008" "yr_2009" "yr_2010"
# IRIS
# Column names available in the iris table
dbListFields(con, "iris")
## [1] "id" "SepalLengthCm" "SepalWidthCm" "PetalLengthCm"
## [5] "PetalWidthCm" "Species"
# Fetch every column of the iris table
irisALL <- dbGetQuery(con, statement = "SELECT * FROM iris")
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 0 imported
## as numeric
# Fetch only the two sepal measurements
irisSELECT <- dbGetQuery(
  con,
  statement = "SELECT SepalLengthCm, SepalWidthCm FROM iris"
)
# YEARLY TABLES
# Column names available in the yr_2006 table
dbListFields(con, "yr_2006")
## [1] "id" "Date"
## [3] "Time" "Global_active_power"
## [5] "Global_reactive_power" "Global_intensity"
## [7] "Voltage" "Sub_metering_1"
## [9] "Sub_metering_2" "Sub_metering_3"
# Query
# The same columns are fetched from every yearly table, so the column list is
# defined once and the SELECT statement is built from it.
vars <- c("Date", "Time", "Sub_metering_1", "Sub_metering_2", "Sub_metering_3")

# Fetch `columns` from one table; produces
# "SELECT Date, Time, ... FROM <table_name>".
fetch_year <- function(table_name, columns = vars, conn = con) {
  dbGetQuery(
    conn,
    sprintf("SELECT %s FROM %s", paste(columns, collapse = ", "), table_name)
  )
}

yr_2006 <- fetch_year("yr_2006")
yr_2007 <- fetch_year("yr_2007")
yr_2008 <- fetch_year("yr_2008")
yr_2009 <- fetch_year("yr_2009")
yr_2010 <- fetch_year("yr_2010")

# Everything needed is now in memory; release the database connection.
invisible(dbDisconnect(con))
# Structure (column types and first values) of each yearly table;
# the `##` lines are the recorded console output.
str(yr_2006)
## 'data.frame': 21992 obs. of 5 variables:
## $ Date : chr "2006-12-16" "2006-12-16" "2006-12-16" "2006-12-16" ...
## $ Time : chr "17:24:00" "17:25:00" "17:26:00" "17:27:00" ...
## $ Sub_metering_1: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Sub_metering_2: num 1 1 2 1 1 2 1 1 1 2 ...
## $ Sub_metering_3: num 17 16 17 17 17 17 17 17 17 16 ...
str(yr_2007)
## 'data.frame': 521669 obs. of 5 variables:
## $ Date : chr "2007-01-01" "2007-01-01" "2007-01-01" "2007-01-01" ...
## $ Time : chr "00:00:00" "00:01:00" "00:02:00" "00:03:00" ...
## $ Sub_metering_1: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Sub_metering_2: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Sub_metering_3: num 0 0 0 0 0 0 0 0 0 0 ...
str(yr_2008)
## 'data.frame': 526905 obs. of 5 variables:
## $ Date : chr "2008-01-01" "2008-01-01" "2008-01-01" "2008-01-01" ...
## $ Time : chr "00:00:00" "00:01:00" "00:02:00" "00:03:00" ...
## $ Sub_metering_1: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Sub_metering_2: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Sub_metering_3: num 18 18 18 18 18 17 18 18 18 18 ...
str(yr_2009)
## 'data.frame': 521320 obs. of 5 variables:
## $ Date : chr "2009-01-01" "2009-01-01" "2009-01-01" "2009-01-01" ...
## $ Time : chr "00:00:00" "00:01:00" "00:02:00" "00:03:00" ...
## $ Sub_metering_1: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Sub_metering_2: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Sub_metering_3: num 0 0 0 0 0 0 0 0 0 0 ...
str(yr_2010)
## 'data.frame': 457394 obs. of 5 variables:
## $ Date : chr "2010-01-01" "2010-01-01" "2010-01-01" "2010-01-01" ...
## $ Time : chr "00:00:00" "00:01:00" "00:02:00" "00:03:00" ...
## $ Sub_metering_1: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Sub_metering_2: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Sub_metering_3: num 18 18 19 18 18 19 18 18 19 18 ...
# Per-column summary statistics of each yearly table
summary(yr_2006)
## Date Time Sub_metering_1 Sub_metering_2
## Length:21992 Length:21992 Min. : 0.000 Min. : 0.000
## Class :character Class :character 1st Qu.: 0.000 1st Qu.: 0.000
## Mode :character Mode :character Median : 0.000 Median : 0.000
## Mean : 1.249 Mean : 2.215
## 3rd Qu.: 0.000 3rd Qu.: 1.000
## Max. :77.000 Max. :74.000
## Sub_metering_3
## Min. : 0.00
## 1st Qu.: 0.00
## Median : 0.00
## Mean : 7.41
## 3rd Qu.:17.00
## Max. :20.00
summary(yr_2007)
## Date Time Sub_metering_1 Sub_metering_2
## Length:521669 Length:521669 Min. : 0.000 Min. : 0.000
## Class :character Class :character 1st Qu.: 0.000 1st Qu.: 0.000
## Mode :character Mode :character Median : 0.000 Median : 0.000
## Mean : 1.232 Mean : 1.638
## 3rd Qu.: 0.000 3rd Qu.: 1.000
## Max. :78.000 Max. :78.000
## Sub_metering_3
## Min. : 0.000
## 1st Qu.: 0.000
## Median : 0.000
## Mean : 5.795
## 3rd Qu.:17.000
## Max. :20.000
summary(yr_2008)
## Date Time Sub_metering_1 Sub_metering_2
## Length:526905 Length:526905 Min. : 0.00 Min. : 0.000
## Class :character Class :character 1st Qu.: 0.00 1st Qu.: 0.000
## Mode :character Mode :character Median : 0.00 Median : 0.000
## Mean : 1.11 Mean : 1.256
## 3rd Qu.: 0.00 3rd Qu.: 1.000
## Max. :80.00 Max. :76.000
## Sub_metering_3
## Min. : 0.000
## 1st Qu.: 0.000
## Median : 1.000
## Mean : 6.034
## 3rd Qu.:17.000
## Max. :31.000
summary(yr_2009)
## Date Time Sub_metering_1 Sub_metering_2
## Length:521320 Length:521320 Min. : 0.000 Min. : 0.000
## Class :character Class :character 1st Qu.: 0.000 1st Qu.: 0.000
## Mode :character Mode :character Median : 0.000 Median : 0.000
## Mean : 1.137 Mean : 1.136
## 3rd Qu.: 0.000 3rd Qu.: 1.000
## Max. :82.000 Max. :77.000
## Sub_metering_3
## Min. : 0.000
## 1st Qu.: 0.000
## Median : 1.000
## Mean : 6.823
## 3rd Qu.:18.000
## Max. :31.000
summary(yr_2010)
## Date Time Sub_metering_1 Sub_metering_2
## Length:457394 Length:457394 Min. : 0.0000 Min. : 0.000
## Class :character Class :character 1st Qu.: 0.0000 1st Qu.: 0.000
## Mode :character Mode :character Median : 0.0000 Median : 0.000
## Mean : 0.9875 Mean : 1.102
## 3rd Qu.: 0.0000 3rd Qu.: 1.000
## Max. :88.0000 Max. :80.000
## Sub_metering_3
## Min. : 0.000
## 1st Qu.: 1.000
## Median : 1.000
## Mean : 7.244
## 3rd Qu.:18.000
## Max. :31.000
# First rows of each yearly table (shows where each table starts)
head(yr_2006)
## Date Time Sub_metering_1 Sub_metering_2 Sub_metering_3
## 1 2006-12-16 17:24:00 0 1 17
## 2 2006-12-16 17:25:00 0 1 16
## 3 2006-12-16 17:26:00 0 2 17
## 4 2006-12-16 17:27:00 0 1 17
## 5 2006-12-16 17:28:00 0 1 17
## 6 2006-12-16 17:29:00 0 2 17
head(yr_2007)
## Date Time Sub_metering_1 Sub_metering_2 Sub_metering_3
## 1 2007-01-01 00:00:00 0 0 0
## 2 2007-01-01 00:01:00 0 0 0
## 3 2007-01-01 00:02:00 0 0 0
## 4 2007-01-01 00:03:00 0 0 0
## 5 2007-01-01 00:04:00 0 0 0
## 6 2007-01-01 00:05:00 0 0 0
head(yr_2008)
## Date Time Sub_metering_1 Sub_metering_2 Sub_metering_3
## 1 2008-01-01 00:00:00 0 0 18
## 2 2008-01-01 00:01:00 0 0 18
## 3 2008-01-01 00:02:00 0 0 18
## 4 2008-01-01 00:03:00 0 0 18
## 5 2008-01-01 00:04:00 0 0 18
## 6 2008-01-01 00:05:00 0 0 17
head(yr_2009)
## Date Time Sub_metering_1 Sub_metering_2 Sub_metering_3
## 1 2009-01-01 00:00:00 0 0 0
## 2 2009-01-01 00:01:00 0 0 0
## 3 2009-01-01 00:02:00 0 0 0
## 4 2009-01-01 00:03:00 0 0 0
## 5 2009-01-01 00:04:00 0 0 0
## 6 2009-01-01 00:05:00 0 0 0
head(yr_2010)
## Date Time Sub_metering_1 Sub_metering_2 Sub_metering_3
## 1 2010-01-01 00:00:00 0 0 18
## 2 2010-01-01 00:01:00 0 0 18
## 3 2010-01-01 00:02:00 0 0 19
## 4 2010-01-01 00:03:00 0 0 18
## 5 2010-01-01 00:04:00 0 0 18
## 6 2010-01-01 00:05:00 0 0 19
# Last rows of each yearly table (shows where each table ends)
tail(yr_2006)
## Date Time Sub_metering_1 Sub_metering_2 Sub_metering_3
## 21987 2006-12-31 23:54:00 0 0 0
## 21988 2006-12-31 23:55:00 0 0 0
## 21989 2006-12-31 23:56:00 0 0 0
## 21990 2006-12-31 23:57:00 0 0 0
## 21991 2006-12-31 23:58:00 0 0 0
## 21992 2006-12-31 23:59:00 0 0 0
tail(yr_2007)
## Date Time Sub_metering_1 Sub_metering_2 Sub_metering_3
## 521664 2007-12-31 23:54:00 0 0 18
## 521665 2007-12-31 23:55:00 0 0 18
## 521666 2007-12-31 23:56:00 0 0 18
## 521667 2007-12-31 23:57:00 0 0 18
## 521668 2007-12-31 23:58:00 0 0 18
## 521669 2007-12-31 23:59:00 0 0 18
tail(yr_2008)
## Date Time Sub_metering_1 Sub_metering_2 Sub_metering_3
## 526900 2008-12-31 23:54:00 0 0 0
## 526901 2008-12-31 23:55:00 0 0 0
## 526902 2008-12-31 23:56:00 0 0 0
## 526903 2008-12-31 23:57:00 0 0 0
## 526904 2008-12-31 23:58:00 0 0 0
## 526905 2008-12-31 23:59:00 0 0 0
tail(yr_2009)
## Date Time Sub_metering_1 Sub_metering_2 Sub_metering_3
## 521315 2009-12-31 23:54:00 0 0 18
## 521316 2009-12-31 23:55:00 0 0 18
## 521317 2009-12-31 23:56:00 0 0 19
## 521318 2009-12-31 23:57:00 0 0 18
## 521319 2009-12-31 23:58:00 0 0 18
## 521320 2009-12-31 23:59:00 0 0 19
tail(yr_2010)
## Date Time Sub_metering_1 Sub_metering_2 Sub_metering_3
## 457389 2010-11-26 20:57:00 0 0 0
## 457390 2010-11-26 20:58:00 0 0 0
## 457391 2010-11-26 20:59:00 0 0 0
## 457392 2010-11-26 21:00:00 0 0 0
## 457393 2010-11-26 21:01:00 0 0 0
## 457394 2010-11-26 21:02:00 0 0 0
# Stack the yearly tables row-wise into a single data frame.
# yr_2006 is not part of the combination; its recorded head/tail output only
# shows rows from 2006-12-16 through 2006-12-31.
All_Years <- bind_rows(list(yr_2007, yr_2008, yr_2009, yr_2010))
summary(All_Years)
## Date Time Sub_metering_1 Sub_metering_2
## Length:2027288 Length:2027288 Min. : 0.000 Min. : 0.000
## Class :character Class :character 1st Qu.: 0.000 1st Qu.: 0.000
## Mode :character Mode :character Median : 0.000 Median : 0.000
## Mean : 1.121 Mean : 1.289
## 3rd Qu.: 0.000 3rd Qu.: 1.000
## Max. :88.000 Max. :80.000
## Sub_metering_3
## Min. : 0.000
## 1st Qu.: 0.000
## Median : 1.000
## Mean : 6.448
## 3rd Qu.:17.000
## Max. :31.000
#PREPROCESSING
## Build a "Date Time" string for every row, store it as DateTime,
## and make it the first column of the new data frame
dataByYears <- All_Years
dataByYears$DateTime <- paste(dataByYears$Date, dataByYears$Time)
dataByYears <- dataByYears[
  ,
  c("DateTime", setdiff(names(dataByYears), "DateTime"))
]
head(dataByYears)
## DateTime Date Time Sub_metering_1 Sub_metering_2
## 1 2007-01-01 00:00:00 2007-01-01 00:00:00 0 0
## 2 2007-01-01 00:01:00 2007-01-01 00:01:00 0 0
## 3 2007-01-01 00:02:00 2007-01-01 00:02:00 0 0
## 4 2007-01-01 00:03:00 2007-01-01 00:03:00 0 0
## 5 2007-01-01 00:04:00 2007-01-01 00:04:00 0 0
## 6 2007-01-01 00:05:00 2007-01-01 00:05:00 0 0
## Sub_metering_3
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
## Convert DateTime from character ("YYYY-MM-DD HH:MM:SS") to POSIXct.
## `format` and `tz` are passed by name: the second positional argument of
## as.POSIXct() is `tz`, so a format string given there is treated as a time
## zone name and triggers "unknown timezone" warnings. The format also uses
## "-" separators to match the strings built from the Date column.
dataByYears$DateTime <- as.POSIXct(
  dataByYears$DateTime,
  format = "%Y-%m-%d %H:%M:%S",
  tz = "GMT"
)
## Add the time zone
attr(dataByYears$DateTime, "tzone") <- "GMT+0"
## Inspect the data types
str(dataByYears)
## 'data.frame': 2027288 obs. of 6 variables:
## $ DateTime : POSIXct, format: "2007-01-01 00:00:00" "2007-01-01 00:01:00" ...
## $ Date : chr "2007-01-01" "2007-01-01" "2007-01-01" "2007-01-01" ...
## $ Time : chr "00:00:00" "00:01:00" "00:02:00" "00:03:00" ...
## $ Sub_metering_1: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Sub_metering_2: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Sub_metering_3: num 0 0 0 0 0 0 0 0 0 0 ...
# Split DateTime into separate calendar/clock components
dataByYears <- dataByYears %>%
  mutate(
    year = year(DateTime),
    month = month(DateTime),
    weekday = weekdays(DateTime),
    day = day(DateTime),
    hour = hour(DateTime),
    minute = minute(DateTime)
  )
#EXPLORATION OF THE DATA. First approach
summary(dataByYears)
## DateTime Date Time
## Min. :2007-01-01 00:00:00 Length:2027288 Length:2027288
## 1st Qu.:2007-12-21 16:32:45 Class :character Class :character
## Median :2008-12-07 16:38:30 Mode :character Mode :character
## Mean :2008-12-09 17:30:05
## 3rd Qu.:2009-11-27 16:09:15
## Max. :2010-11-26 21:02:00
## Sub_metering_1 Sub_metering_2 Sub_metering_3 year
## Min. : 0.000 Min. : 0.000 Min. : 0.000 Min. :2007
## 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.:2007
## Median : 0.000 Median : 0.000 Median : 1.000 Median :2008
## Mean : 1.121 Mean : 1.289 Mean : 6.448 Mean :2008
## 3rd Qu.: 0.000 3rd Qu.: 1.000 3rd Qu.:17.000 3rd Qu.:2009
## Max. :88.000 Max. :80.000 Max. :31.000 Max. :2010
## month weekday day hour
## Min. : 1.000 Length:2027288 Min. : 1.00 Min. : 0.0
## 1st Qu.: 3.000 Class :character 1st Qu.: 8.00 1st Qu.: 5.0
## Median : 6.000 Mode :character Median :16.00 Median :12.0
## Mean : 6.394 Mean :15.62 Mean :11.5
## 3rd Qu.: 9.000 3rd Qu.:23.00 3rd Qu.:18.0
## Max. :12.000 Max. :31.00 Max. :23.0
## minute
## Min. : 0.0
## 1st Qu.:15.0
## Median :30.0
## Mean :29.5
## 3rd Qu.:44.0
## Max. :59.0
# Descriptive statistics for every column.
# In the recorded output below, the non-numeric columns (DateTime, Date, Time,
# weekday) appear as NaN/Inf rows; they are presumably what triggers the
# min/max warnings.
describe(dataByYears) #from the psych package
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
## vars n mean sd min max range se
## DateTime 1 2027288 NaN NA Inf -Inf -Inf NA
## Date 2 2027288 NaN NA Inf -Inf -Inf NA
## Time 3 2027288 NaN NA Inf -Inf -Inf NA
## Sub_metering_1 4 2027288 1.12 6.15 0 88 88 0.00
## Sub_metering_2 5 2027288 1.29 5.79 0 80 80 0.00
## Sub_metering_3 6 2027288 6.45 8.43 0 31 31 0.01
## year 7 2027288 2008.45 1.10 2007 2010 3 0.00
## month 8 2027288 6.39 3.39 1 12 11 0.00
## weekday 9 2027288 NaN NA Inf -Inf -Inf NA
## day 10 2027288 15.62 8.80 1 31 30 0.01
## hour 11 2027288 11.50 6.92 0 23 23 0.00
## minute 12 2027288 29.50 17.32 0 59 59 0.01
# Normal Q-Q plot of each sub-meter, one plot per column
invisible(lapply(
  dataByYears[c("Sub_metering_1", "Sub_metering_2", "Sub_metering_3")],
  qqnorm
))

# Count missing values per column (TRUE = missing)
dataByYears %>%
  is.na() %>%
  summary()
## DateTime Date Time Sub_metering_1
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:2027288 FALSE:2027288 FALSE:2027288 FALSE:2027288
## Sub_metering_2 Sub_metering_3 year month
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:2027288 FALSE:2027288 FALSE:2027288 FALSE:2027288
## weekday day hour minute
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:2027288 FALSE:2027288 FALSE:2027288 FALSE:2027288
# Give the sub-meter columns descriptive names
# (dplyr alternative: rename(Kitchen = Sub_metering_1, Laundry = Sub_metering_2,
#  AC_Heater = Sub_metering_3))
meter_names <- c(
  Sub_metering_1 = "Kitchen",
  Sub_metering_2 = "Laundry",
  Sub_metering_3 = "AC_Heater"
)
is_meter <- names(dataByYears) %in% names(meter_names)
names(dataByYears)[is_meter] <- unname(meter_names[names(dataByYears)[is_meter]])
# Long format: one row per (timestamp, sub-meter) pair, with the sub-meter name
# in Sub_metering and its reading in Value
sub_meterings_All <- gather(
  dataByYears,
  key = "Sub_metering",
  value = "Value",
  Kitchen:AC_Heater
)
#VISUALIZATIONS. First approach
#1Var
#One Sub_metering at a time
#Histograms: distribution of the readings of each sub-meter over all years
hist(dataByYears$Kitchen)

hist(dataByYears$Laundry)

hist(dataByYears$AC_Heater)

#BoxPlots: readings of each sub-meter, one box per year
boxplot(Kitchen~year,data=dataByYears)

boxplot(Laundry~year,data=dataByYears)

boxplot(AC_Heater~year,data=dataByYears)

# 2 variables: sub-meter and year
# Horizontal boxplots of the readings per sub-meter, one panel per year
ggplot(sub_meterings_All, aes(x = Sub_metering, y = Value)) +
  geom_boxplot() +
  coord_flip() +
  facet_grid(.~year)

# Overlaid density-scaled histograms, one colour per sub-meter
ggplot(sub_meterings_All, aes(x = Value, fill = Sub_metering)) +
  geom_histogram(aes(y = ..density..), alpha = 0.5, position = 'identity')
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Kernel density estimate per sub-meter
ggplot(sub_meterings_All, aes(x = Value, fill = Sub_metering)) +
  geom_density(alpha = 0.2)

#qqPlot(lm(prestige ~ income + education + type, data=Duncan),envelope=.99)
# EXPLORATION OF THE DATA. Second approach
# Frequency (count and share) of every distinct reading
Values_n <- sub_meterings_All %>%
  group_by(Value) %>%
  summarise(n = n()) %>%
  mutate(freq = n / sum(n))

# Readings different from zero. filter() works row by row, so no grouping is
# needed here; leaving it out also keeps the result ungrouped for later steps.
Values_not_0 <- sub_meterings_All %>%
  filter(Value != 0)
Values_not_0_n <- Values_not_0 %>%
  group_by(Value) %>%
  summarise(n = n()) %>%
  mutate(freq = n / sum(n))

# Readings greater than one (drops both 0 and 1)
Values_not_0_1 <- sub_meterings_All %>%
  filter(Value > 1)
Values_not_0_1_n <- Values_not_0_1 %>%
  group_by(Value) %>%
  summarise(n = n()) %>%
  mutate(freq = n / sum(n))
Values_not_0_1_n
## # A tibble: 86 x 3
## Value n freq
## <dbl> <int> <dbl>
## 1 2 176481 0.171
## 2 3 9505 0.00922
## 3 4 7988 0.00775
## 4 5 6562 0.00637
## 5 6 3812 0.00370
## 6 7 3080 0.00299
## 7 8 2782 0.00270
## 8 9 2901 0.00282
## 9 10 4215 0.00409
## 10 11 12128 0.0118
## # … with 76 more rows
# VISUALIZATIONS. Second approach: same plots, zero readings excluded
# 2 variables: sub-meter and year
# Horizontal boxplots per sub-meter, one panel per year
ggplot(Values_not_0, aes(x = Sub_metering, y = Value)) +
  geom_boxplot() +
  coord_flip() +
  facet_grid(.~year)

# Overlaid density-scaled histograms, one colour per sub-meter
ggplot(Values_not_0, aes(x = Value, fill = Sub_metering)) +
  geom_histogram(aes(y = ..density..), alpha = 0.5, position = 'identity')
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Kernel density estimate per sub-meter
ggplot(Values_not_0, aes(x = Value, fill = Sub_metering)) +
  geom_density(alpha = 0.2)

# VISUALIZATIONS. Third approach: readings of 0 and 1 excluded
# 2 variables: sub-meter and year
# Horizontal boxplots per sub-meter, one panel per year
ggplot(Values_not_0_1, aes(Sub_metering, Value)) +
  geom_boxplot() +
  coord_flip() +
  facet_grid(.~year)

# Count of observations at each distinct reading, stacked by sub-meter.
# This was geom_histogram(bin=50, stat="count"): `bin` is not a ggplot2
# parameter and stat = "count" ignores binning, so the layer was already a bar
# chart. geom_bar() draws the same chart without the
# "Ignoring unknown parameters" warning.
ggplot_0_50_non_0_1 <- ggplot(Values_not_0_1, aes(Value, fill = Sub_metering)) +
  geom_bar()
# Interactive version of the bar chart
ggplot_0_50_non_0_1 %>% ggplotly()
# Experiment: treat the reading as a categorical variable
Values_not_0_1$Value <- as.factor(Values_not_0_1$Value)
# Density per sub-meter on the converted column
ggplot(Values_not_0_1, aes(x = Value, fill = Sub_metering)) +
  geom_density(alpha = 0.2)

# Prefer fixed notation over scientific notation when printing numbers
options(scipen = 999)
#scale x breaks, xlim
#PEAKS
#I found that there is a peak in AC_Heater at 18 watts/hour. 17 and 19 are also high, so that might correspond to the oscillation in watts when the device turns on and off.
#